{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import glob\n",
    "import pandas as pd\n",
    "import math\n",
    "import numpy as np\n",
    "import os\n",
    "import scipy.io as sio\n",
    "\n",
    "from PIL import Image\n",
    "\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_colwidth', -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The order matters, don't touch these #\n",
    "previous_attributes = ['personalLess30', 'personalLess45', 'personalLess60', 'personalLarger60', 'carryingBackpack', \\\n",
    "                       'carryingOther', 'lowerBodyCasual', 'upperBodyCasual', 'lowerBodyFormal', 'upperBodyFormal', \\\n",
    "                       'accessoryHat', 'upperBodyJacket', 'lowerBodyJeans', 'footwearLeatherShoes', 'upperBodyLogo', \\\n",
    "                       'hairLong', 'personalMale', 'carryingMessengerBag', 'accessoryMuffler', 'accessoryNothing', \\\n",
    "                       'carryingNothing', 'upperBodyPlaid', 'carryingPlasticBags', 'footwearSandals', 'footwearShoes', \\\n",
    "                       'lowerBodyShorts', 'upperBodyShortSleeve', 'lowerBodyShortSkirt', 'footwearSneaker', \\\n",
    "                       'upperBodyThinStripes', 'accessorySunglasses', 'lowerBodyTrousers', 'upperBodyTshirt', \\\n",
    "                       'upperBodyOther', 'upperBodyVNeck', 'upperBodyBlack', 'upperBodyBlue', 'upperBodyBrown', \\\n",
    "                       'upperBodyGreen', 'upperBodyGrey', 'upperBodyOrange', 'upperBodyPink', 'upperBodyPurple', \\\n",
    "                       'upperBodyRed', 'upperBodyWhite', 'upperBodyYellow', 'lowerBodyBlack', 'lowerBodyBlue', \\\n",
    "                       'lowerBodyBrown', 'lowerBodyGreen', 'lowerBodyGrey', 'lowerBodyOrange', 'lowerBodyPink', \\\n",
    "                       'lowerBodyPurple', 'lowerBodyRed', 'lowerBodyWhite', 'lowerBodyYellow', 'hairBlack', 'hairBlue', \\\n",
    "                       'hairBrown', 'hairGreen', 'hairGrey', 'hairOrange', 'hairPink', 'hairPurple', 'hairRed', \\\n",
    "                       'hairWhite', 'hairYellow', 'footwearBlack', 'footwearBlue', 'footwearBrown', 'footwearGreen', \\\n",
    "                       'footwearGrey', 'footwearOrange', 'footwearPink', 'footwearPurple', 'footwearRed', 'footwearWhite', \\\n",
    "                       'footwearYellow', 'accessoryHeadphone', 'personalLess15', 'carryingBabyBuggy', 'hairBald', \\\n",
    "                       'footwearBoots', 'lowerBodyCapri', 'carryingShoppingTro', 'carryingUmbrella', 'personalFemale', \\\n",
    "                       'carryingFolder', 'accessoryHairBand', 'lowerBodyHotPants', 'accessoryKerchief', \\\n",
    "                       'lowerBodyLongSkirt', 'upperBodyLongSleeve', 'lowerBodyPlaid', 'lowerBodyThinStripes', \\\n",
    "                       'carryingLuggageCase', 'upperBodyNoSleeve', 'hairShort', 'footwearStocking', 'upperBodySuit', \\\n",
    "                       'carryingSuitcase', 'lowerBodySuits', 'upperBodySweater', 'upperBodyThickStripes']\n",
    "\n",
    "\n",
    "added_attributes = ['carryingBlack', 'carryingBlue', 'carryingBrown', 'carryingGreen', 'carryingGrey', \\\n",
    "                'carryingOrange', 'carryingPink', 'carryingPurple', 'carryingRed', 'carryingWhite', 'carryingYellow']\n",
    "\n",
    "PETA_folders = [\"3DPeS\", \"CAVIAR4REID\", \"CUHK\", \"GRID\", \"i-LID\", \"MIT\", \"PRID\", \"SARC3D\", \"TownCentre\", \"VIPeR\"]\n",
    "# The order matters, don't touch these #\n",
    "\n",
    "\n",
    "# Change here #\n",
    "labels_text_filepaths = \"./Updated_Labels/\"\n",
    "peta_dataset_filepaths = \"./PETA dataset/\"\n",
    "\n",
    "peta_images_output_filepath=\"./data/PETA/images/\"\n",
    "process_images = True\n",
    "output_mat_filepath = './data/PETA/PETA.mat'\n",
    "\n",
    "prev_peta_filepath = \"./PETA_old.mat\"\n",
    "# Change here #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert(os.path.exists(labels_text_filepaths)), \"We need the updated labels in the 3DPeS.txt, CAVIAR4REID.txt, ... format\"\n",
    "assert(os.path.exists(peta_dataset_filepaths)), \"Please download the original PETA data with the following file format: PETA dataset\\{3DPeS, CAVAIR4REID, ...}\\archive, it can be found at http://mmlab.ie.cuhk.edu.hk/projects/PETA.html#:~:text=The%20PETA%20dataset%20consists%20of,%23Images\"\n",
    "\n",
    "assert(os.path.exists(prev_peta_filepath)), \"Please download the original PETA.mat file and rename it to %s.\" %(prev_peta_filepath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "if process_images and not os.path.exists(peta_images_output_filepath):\n",
    "    print(\"Creating peta image output filepath at %s\" %(peta_images_output_filepath))\n",
    "    os.makedirs(peta_images_output_filepath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Putting all the attributes together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_attributes = previous_attributes + added_attributes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_attributes1 = np.zeros((len(all_attributes),1), dtype=np.object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_attributes1[:] = [[x] for x in all_attributes]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Process the labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['./Updated_Labels/3DPeS.txt', './Updated_Labels/CAVIAR4REID.txt', './Updated_Labels/CUHK.txt', './Updated_Labels/GRID.txt', './Updated_Labels/i-LID.txt', './Updated_Labels/MIT.txt', './Updated_Labels/PRID.txt', './Updated_Labels/SARC3D.txt', './Updated_Labels/TownCentre.txt', './Updated_Labels/VIPeR.txt']\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "# Note this doesn't work with mac/linux because of sometimes additional files, and its sorted differently\n",
    "all_label_filepath = glob.glob(labels_text_filepaths + \"*\")\n",
    "sorted(all_label_filepath)\n",
    "all_label_filepath\n",
    "\"\"\"\n",
    "all_label_filepath = [os.path.join(labels_text_filepaths, PETA_folder) + \".txt\" for PETA_folder in PETA_folders]\n",
    "print(all_label_filepath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "attribute_labels_df = pd.DataFrame()\n",
    "\n",
    "dataset_id = 1\n",
    "for label_filepath in all_label_filepath:\n",
    "    #print(label_filepath)\n",
    "    with open(label_filepath, 'r') as f:\n",
    "        file = f.readlines()\n",
    "\n",
    "    df_tmp = pd.DataFrame(file, columns = ['temp']) \n",
    "    df_tmp['dataset_id'] = dataset_id\n",
    "    #print(df_tmp.shape)\n",
    "\n",
    "    attribute_labels_df = pd.concat([attribute_labels_df, df_tmp])\n",
    "    #print(df.shape)\n",
    "    \n",
    "    dataset_id += 1\n",
    "\n",
    "del df_tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8707, 2)\n"
     ]
    }
   ],
   "source": [
    "print(attribute_labels_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                                                                                                                                                                                                                                        temp  \\\n",
      "0  1 upperBodyBlue lowerBodyBlack hairBlack footwearWhite lowerBodyCasual lowerBodyJeans personalLess45 personalMale upperBodyCasual upperBodyShortSleeve upperBodyThinStripes upperBodyTshirt hairShort footwearShoes carryingMessengerBag accessoryNothing carryingBlack\\n   \n",
      "1  2 upperBodyRed lowerBodyGrey hairBrown footwearWhite lowerBodyCasual lowerBodyJeans personalFemale personalLess30 upperBodyCasual upperBodyShortSleeve upperBodyTshirt hairShort footwearShoes carryingNothing accessoryNothing\\n                                           \n",
      "2  3 upperBodyBlue lowerBodyBlack hairBlack footwearWhite lowerBodyCasual lowerBodyTrousers personalLess30 personalMale upperBodyCasual upperBodyShortSleeve upperBodyTshirt hairShort footwearShoes carryingNothing accessoryNothing\\n                                        \n",
      "3  4 upperBodyBlack upperBodyWhite lowerBodyBlack hairBlack footwearBlack lowerBodyCasual lowerBodyJeans personalLess30 personalMale upperBodyPlaid upperBodyShortSleeve hairShort footwearShoes carryingNothing accessoryNothing\\n                                            \n",
      "4  5 upperBodyBlack upperBodyWhite lowerBodyBlue footwearBrown lowerBodyCasual lowerBodyJeans personalLarger60 personalMale upperBodyPlaid upperBodyShortSleeve hairBald footwearShoes carryingNothing accessoryNothing\\n                                                      \n",
      "\n",
      "   dataset_id  \n",
      "0  1           \n",
      "1  1           \n",
      "2  1           \n",
      "3  1           \n",
      "4  1           \n"
     ]
    }
   ],
   "source": [
    "print(attribute_labels_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "attribute_labels_df[\"pedestrian_id\"] = attribute_labels_df.apply(lambda x: int(x[\"temp\"].split(\" \")[0].split(\".\")[0]), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the one-hot-encoding based on the order in the all_attributes (to fit the original mat file)\n",
    "for attr in all_attributes:\n",
    "    attribute_labels_df[attr] = attribute_labels_df.apply(lambda x: 1 if attr in x[\"temp\"] else 0, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "attribute_labels_df = attribute_labels_df.drop(columns=\"temp\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Process the images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['./PETA dataset/3DPeS', './PETA dataset/CAVIAR4REID', './PETA dataset/CUHK', './PETA dataset/GRID', './PETA dataset/i-LID', './PETA dataset/MIT', './PETA dataset/PRID', './PETA dataset/SARC3D', './PETA dataset/TownCentre', './PETA dataset/VIPeR']\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "# Note this doesn't work with mac/linux because of sometimes additional files, and its sorted differently\n",
    "all_folders_filepath = glob.glob(peta_dataset_filepaths + \"*\")\n",
    "sorted(all_folders_filepath)\n",
    "all_folders_filepath\n",
    "\"\"\"\n",
    "\n",
    "all_folders_filepath = [os.path.join(peta_dataset_filepaths, PETA_folder) for PETA_folder in PETA_folders]\n",
    "print(all_folders_filepath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Contains the img_id, dataset_id, pedestrian_id\n",
    "# img_id is a running number from 1 to n images\n",
    "# dataset_id 1 to 10 based on the dataset, so 1 is 3DPeS, 10 is VIPeR, etc etc\n",
    "# pedestrian_id is taken from the image name from the original PETA dataset\n",
    "\"\"\"\n",
    "['./PETA dataset\\\\3DPeS',\n",
    " './PETA dataset\\\\CAVIAR4REID',\n",
    " './PETA dataset\\\\CUHK',\n",
    " './PETA dataset\\\\GRID',\n",
    " './PETA dataset\\\\i-LID',\n",
    " './PETA dataset\\\\MIT',\n",
    " './PETA dataset\\\\PRID',\n",
    " './PETA dataset\\\\SARC3D',\n",
    " './PETA dataset\\\\TownCentre',\n",
    " './PETA dataset\\\\VIPeR']\n",
    "\"\"\"\n",
    "# for example if the naming is \"100_3_FRAME_26_RGB.bmp\" then the pedestrian_id is 100, this is to map to the attribute_labels_df\n",
    "\n",
    "partial_ids_df = pd.DataFrame(columns = ['img_id', 'dataset_id', 'pedestrian_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "img_id = 1\n",
    "\n",
    "for i in range(len(all_folders_filepath)):\n",
    "    images_filepath = glob.glob(all_folders_filepath[i] + \"/archive/*[!txt]\")\n",
    "    for image_path in images_filepath:\n",
    "        #ID = image_path.split(\"\\\\\")[-1].split(\"_\")[0].split(\".\")[0] \n",
    "        ID = os.path.split(image_path)[-1].split(\"_\")[0].split(\".\")[0] # for compatibility btw windows/mac/linux\n",
    "        #print(ID)\n",
    "        \n",
    "        if process_images:\n",
    "            img = Image.open(image_path)\n",
    "            img.save(peta_images_output_filepath + str(img_id).zfill(5)+\".png\")\n",
    "        \n",
    "        partial_ids_df.loc[img_id-1] = [int(img_id), int(i+1), int(ID)]\n",
    "        img_id += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(19000, 3)\n"
     ]
    }
   ],
   "source": [
    "print(partial_ids_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "partial_ids_df = partial_ids_df.astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = partial_ids_df[[\"dataset_id\", \"pedestrian_id\"]].drop_duplicates().sort_values(by=[\"dataset_id\", \"pedestrian_id\"]).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# So apparently there is also a unique ID for the dataset & pedestrian together.\n",
    "tmp[\"dataset_pedestrian_id\"] = tmp.index + 1\n",
    "tmp = tmp.drop(columns=\"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "full_ids_df = pd.merge(partial_ids_df, tmp, on=[\"dataset_id\", \"pedestrian_id\"], how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(19000, 4)\n"
     ]
    }
   ],
   "source": [
    "print(full_ids_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "del tmp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine the images and the labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "attributes_n_ids_df = pd.merge(full_ids_df, attribute_labels_df, on=[\"dataset_id\", \"pedestrian_id\"], how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(19000, 120)\n"
     ]
    }
   ],
   "source": [
    "print(attributes_n_ids_df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Copy the train/test split from the original PETA.mat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = sio.loadmat(prev_peta_filepath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Rearrange the columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "ID_cols = [\"img_id\", \"dataset_pedestrian_id\", \"dataset_id\", \"pedestrian_id\"]\n",
    "\n",
    "rearranged_columns = ID_cols + all_attributes\n",
    "\n",
    "attributes_n_ids_df = attributes_n_ids_df[rearranged_columns]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine it all together "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "attribute_ids_n_values = np.array(attributes_n_ids_df.values) #, dtype='uint16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Just look for last minute QC\n",
    "#attribute_ids_n_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "attribute_names = [[np.array([x], dtype=\"U\")] for x in all_attributes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "non_id_start_index = len(ID_cols) + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "attribute_train_ids = np.array([list(range(non_id_start_index, len(all_attributes) + non_id_start_index))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_final = {'peta': {'data':attribute_ids_n_values,\n",
    "                       'attribute':all_attributes1,\n",
    "                       'selected_attribute':attribute_train_ids,\n",
    "                       'partion_attribute':data['peta'][0][0][3]}} # this comes from the original PETA.mat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "sio.savemat(output_mat_filepath, data_final)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# output_mat_filepath"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loaddata = sio.loadmat(output_mat_filepath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loaddata['peta'][0][0][0][:, 4:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loaddata['peta'][0][0][0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loaddata['peta'][0][0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loaddata['peta'][0][0][0][18990]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('./data/PETA/dataset.pkl', 'rb') as f:\n",
    "#     x = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# x.__dict__.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# x.label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
